#########################
# 3. CFscore imputation and validation
#########################


# load packages
library(MALDIquant)
library(randomForest)
library(dplyr)
library(missForest)
library(haven)
library(ggplot2)

# load data
load('1_output.RData')


###

# 1. Prepare for RF imputation

# create variables that condition issues --> ideology link
cces12$donor <- ifelse(complete.cases(cces12$true_CFscore), 1, 0)
cces12$over100k <- ifelse(cces12$faminc < 10, 'no',
                          ifelse(cces12$faminc < 97, 'yes', 'missing'))
cces12$knowledge <- ifelse(cces12$CC309a == 1 & cces12$CC309b == 2, 1,
                            ifelse(cces12$CC309a == 1 | cces12$CC309b == 2, 0.5, 0))

# create model formula
temp <- paste(c(policy_chars,'over100k','birthyr','knowledge'), collapse=" + ")
formula_full <- as.formula(paste("true_CFscore", temp, sep = " ~ "))


###

# 2. RF imputation of CFscores for non-donors

set.seed(1865)
rf <- randomForest(formula_full,
                          data=cces12[cces12$donor == 1,],ntree=1000,nodesize=10,mtry = 2,
                          na.action = na.omit,keep.inbag=T)
cces12$imputed_CFscore <-  predict(rf, newdata = cces12)


###

# 3. Distribution of CFscores for donors and non-donors (Table 3)

quantile(cces12$true_CFscore[cces12$party_id == 'Democrat'], probs = seq(0,1,.05), na.rm = T)
quantile(cces12$true_CFscore[cces12$party_id == 'Independent'], probs = seq(0,1,.05), na.rm = T)
quantile(cces12$true_CFscore[cces12$party_id == 'Republican'], probs = seq(0,1,.05), na.rm = T)

quantile(cces12$imputed_CFscore[cces12$party_id == 'Democrat' & cces12$donor == 0], probs = seq(0,1,.05), na.rm = T)
quantile(cces12$imputed_CFscore[cces12$party_id == 'Independent' & cces12$donor == 0], probs = seq(0,1,.05), na.rm = T)
quantile(cces12$imputed_CFscore[cces12$party_id == 'Republican' & cces12$donor == 0], probs = seq(0,1,.05), na.rm = T)


###

# 4. Validate imputed CFscores for non-donors

# a. Create relevant variables

# ideological self-placement
cces12$ideo5 <- ifelse(cces12$ideo5 > 5, 3, cces12$ideo5) # adjust ideo and pid to standard

# unidimensional scaling of policy views
lm(ideo5 ~ budgetprefs, data = cces12) # (figure out what budget preferences are more conservative)
cces12$conprefs <- ifelse(cces12$budgetprefs == "2 1" | cces12$budgetprefs == "2 3", 1, 0)
policies[25] <- 'conprefs'
pca <- princomp(na.omit(cces12[,policies])) # create PCA scaling
cces12$pca1[complete.cases(cces12[policies])] <- pca$scores[,1]


# b. Calculate correlations between imputed CFscores and others
cor.test(~ imputed_CFscore + ideo5, data = cces12[cces12$donor == 0,]) #0.69
cor.test(~ imputed_CFscore + pca1, data = cces12[cces12$donor == 0,]) #0.92
cor.test(~ abs(imputed_CFscore) + knowledge, data = cces12[cces12$donor == 0,]) #0.35


###

# 5. Bootstrap analysis - do CFscores predict voting better? (Figure 3)

# Create vote choice variable
cces12$house_vote_gop <- ifelse(cces12$HouseCand1Party != 'Democratic' & cces12$HouseCand2Party != 'Republican', NA,
                                ifelse(cces12$CC412 == 2, 1,
                                       ifelse(cces12$CC412 == 1, 0, NA)))

# Create df to sample from
cces12$bootstrap <- (complete.cases(cces12$house_vote_gop) & cces12$donor == 0 &
                           complete.cases(cces12$imputed_CFscore) & complete.cases(cces12$pca1) & complete.cases(cces12$ideo5))
cces_bootstrap <- cces12[cces12$bootstrap == T,]


# a. Run bootstrap analysis

# Vectors for results
cf_error <- rep(NA, 500)
pca_error <- rep(NA, 500)
ideo5_error  <- rep(NA, 500)

set.seed(2023)

for (i in 1:500){
  temp_df <- cces_bootstrap[sample(1:NROW(cces_bootstrap), 500),] # random draw
  
  temp.fit <- glm(house_vote_gop ~ ideo5, data = temp_df, family = 'binomial')
  temp.pred <- predict(temp.fit, temp_df, type = 'response')
  temp.pred <- ifelse(temp.pred > .5, 1, 0)
  
  ideo5_error[i] <- 1 - (sum(temp.pred == temp_df$house_vote_gop)/500) # error from ideo5 measure
  
  temp.fit <- glm(house_vote_gop ~ pca1, data = temp_df, family = 'binomial')
  temp.pred <- predict(temp.fit, temp_df, type = 'response')
  temp.pred <- ifelse(temp.pred > .5, 1, 0)
  
  pca_error[i] <- 1 - (sum(temp.pred == temp_df$house_vote_gop)/500) # error from pca measure
  
  temp.fit <- glm(house_vote_gop ~ imputed_CFscore, data = temp_df, family = 'binomial')
  temp.pred <- predict(temp.fit, temp_df, type = 'response')
  temp.pred <- ifelse(temp.pred > .5, 1, 0)
  
  cf_error[i] <- 1 - (sum(temp.pred == temp_df$house_vote_gop)/500) # error from CFscores
}


# b. Organize results into dataframe

results <- as.data.frame(c(ideo5_error, pca_error, cf_error))
names(results) <- 'error_rate'
results$method <- rep(c('Ideological Self-Placement', 'PCA Scaling', 'Imputed CFscores'), each = 500)
results$method <- factor(results$method,levels = c("Ideological Self-Placement", "PCA Scaling", "Imputed CFscores"))


# c. Present results in plot

p_results <- ggplot(results, aes(x = method, y = error_rate, col = method)) + scale_color_manual(values = c(rep("grey80",4))) 
p_results + geom_jitter(position=position_jitter(0.2)) + 
  stat_summary(fun.data=mean_cl_normal, geom="errorbar", col="black") + theme_classic() + coord_flip() +
  theme(text = element_text(size = 15)) + xlab("Method") + ylab("Classification Error Rate") + theme(legend.position = 'none') +
  ggtitle('Figure 3. Imputed CFscores predict US House vote with less error
          than other measures of ideology') + theme(plot.title = element_text(hjust = 0.5)) + theme(axis.text = element_text(size = 18))


# d. Calculate mean error and % improvement with CFscores

mean(cf_error)
mean(cf_error) / mean(pca_error)
mean(cf_error) / mean(ideo5_error)


###

# 6. Compare donor with nearest non-donor in CFscore and PCA common spaces

# a. Create variables for comparison
cces12$pres_vote_gop <- ifelse(cces12$CC410a == 2, 1,
                               ifelse(cces12$CC410a < 5, 0, NA))
cces12$house_vote_gop
cces12$party_id


# b. Create donor and non-donor DFs with no missinginess
can_match <- complete.cases(cces12$imputed_CFscore) & complete.cases(cces12$pca1)

donors <- cces12[cces12$donor == 1 & can_match == T,]
nondonors <- cces12[cces12$donor == 0 & can_match == T,]
names(nondonors) <- paste(names(nondonors), '.nd', sep='')


# c. Match donors on PCA
nondonors <- nondonors[order(nondonors$pca1),] # order nondonors from left to right
pca_match <- match.closest(donors$pca1, nondonors$pca1.nd) # find nondonor with nearest value for donors
pca_df <- cbind(donors, nondonors[pca_match,]) # bind their data together

summary(abs(pca_df$pca1 - pca_df$pca1.nd) / sd(pca_df$pca1)) # avg match off by .00004 SDs


# d. Match donors on CFscore
nondonors <- nondonors[order(nondonors$imputed_CFscore.nd),]
cf_match <- match.closest(donors$true_CFscore, nondonors$imputed_CFscore.nd)
cf_df <- cbind(donors, nondonors[cf_match,])

summary(abs(cf_df$true_CFscore - cf_df$imputed_CFscore.nd) / sd(cf_df$true_CFscore)) # off by .019


# e. Observe behavioral overlap
sum(pca_df$pres_vote_gop == pca_df$pres_vote_gop.nd, na.rm = T) / sum(complete.cases(pca_df$pres_vote_gop) & complete.cases(pca_df$pres_vote_gop.nd))
sum(pca_df$house_vote_gop == pca_df$house_vote_gop.nd, na.rm = T) / sum(complete.cases(pca_df$house_vote_gop) & complete.cases(pca_df$house_vote_gop.nd))
sum(pca_df$party_id == pca_df$party_id.nd, na.rm = T) / sum(complete.cases(pca_df$party_id) & complete.cases(pca_df$party_id.nd))

sum(cf_df$pres_vote_gop == cf_df$pres_vote_gop.nd, na.rm = T) / sum(complete.cases(cf_df$pres_vote_gop) & complete.cases(cf_df$pres_vote_gop.nd))
sum(cf_df$house_vote_gop == cf_df$house_vote_gop.nd, na.rm = T) / sum(complete.cases(cf_df$house_vote_gop) & complete.cases(cf_df$house_vote_gop.nd))
sum(cf_df$party_id == cf_df$party_id.nd, na.rm = T) / sum(complete.cases(cf_df$party_id) & complete.cases(cf_df$party_id.nd))


###

# 6. Save and garbage collection

rm(list=setdiff(ls(), c("cces12")))
save.image('3_output.RData')

write_dta(cces12[,c('V101','imputed_CFscore')], 'Imputed_CFscores.dta')
